In [32]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/netflix-shows/netflix_titles.csv

Exploratory Data Analysis¶

In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Loading the Data¶

In [34]:
df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

Visualising using Ydata_profiling¶

In [35]:
from ydata_profiling import ProfileReport
/opt/conda/lib/python3.10/site-packages/numba/core/decorators.py:262: NumbaDeprecationWarning: numba.generated_jit is deprecated. Please see the documentation at: https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-generated-jit for more information and advice on a suitable replacement.
  warnings.warn(msg, NumbaDeprecationWarning)
/opt/conda/lib/python3.10/site-packages/visions/backends/shared/nan_handling.py:51: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def hasna(x: np.ndarray) -> bool:
In [36]:
ProfileReport(df)
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[36]:

In [37]:
df.head()
Out[37]:
show_id type title director cast country date_added release_year rating duration listed_in description
0 s1 Movie Dick Johnson Is Dead Kirsten Johnson NaN United States September 25, 2021 2020 PG-13 90 min Documentaries As her father nears the end of his life, filmm...
1 s2 TV Show Blood & Water NaN Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... South Africa September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, TV Dramas, TV Mysteries After crossing paths at a party, a Cape Town t...
2 s3 TV Show Ganglands Julien Leclercq Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... NaN September 24, 2021 2021 TV-MA 1 Season Crime TV Shows, International TV Shows, TV Act... To protect his family from a powerful drug lor...
3 s4 TV Show Jailbirds New Orleans NaN NaN NaN September 24, 2021 2021 TV-MA 1 Season Docuseries, Reality TV Feuds, flirtations and toilet talk go down amo...
4 s5 TV Show Kota Factory NaN Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... India September 24, 2021 2021 TV-MA 2 Seasons International TV Shows, Romantic TV Shows, TV ... In a city of coaching centers known to train I...
In [38]:
df.isnull().sum()
Out[38]:
show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64
In [39]:
df['rating'].unique()
Out[39]:
array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
       'TV-Y7-FV', 'UR'], dtype=object)
In [40]:
# replacing missing country values with the most common country
mode_country = df['country'].mode()[0]
df['country'].fillna(mode_country, inplace=True)
In [41]:
ax = sns.countplot(x=df.type,palette='rocket')
ax.bar_label(ax.containers[0])
ax.set_title('Number of Movies and TV Shows on Netflix')
Out[41]:
Text(0.5, 1.0, 'Number of Movies and TV Shows on Netflix')
In [42]:
# Group the data by 'country' and count the number of occurrences
country_counts = df['country'].value_counts()

# Select the top N countries for better visualization (adjust as needed)
top_countries = country_counts.head(10)

# Plotting the bar chart
plt.figure(figsize=(10, 6))
top_countries.plot(kind='bar')
plt.title('Amount of Content per Country')
plt.xlabel('Country')
plt.ylabel('Number of Shows/Movies')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()
plt.show()
In [43]:
type_count = df['type'].value_counts()

# Plotting the bar chart
plt.figure(figsize=(5, 6))
type_count.plot(kind='bar')
plt.title('Most common type of content')
plt.xlabel('Country')
plt.ylabel('Number of Shows/Movies')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()
plt.show()
In [44]:
# Create a new column 'genres' by splitting the 'listed_in' values
df['genres'] = df['listed_in'].str.split(', ')

# Flatten the list of genres
all_genres = [genre for sublist in df['genres'].dropna() for genre in sublist]
In [45]:
# Count the occurrences of each genre
genre_counts = pd.Series(all_genres).value_counts()
In [46]:
# Display the most common genre
most_common_genre = genre_counts.idxmax()
print(f"The most common genre for a movie or TV show is: {most_common_genre}")
The most common genre for a movie or TV show is: International Movies
In [47]:
# Create a new column 'genres' by splitting the 'listed_in' values
df['cast_list'] = df['cast'].str.split(', ')

# Flatten the list of genres
cast_list = [genre for sublist in df['cast_list'].dropna() for genre in sublist]

# Count the occurrences of each genre
cast_list_count = pd.Series(cast_list).value_counts()
In [48]:
most_casted_actor = cast_list_count.idxmax()
print(f"The most common genre for a movie or TV show is: {most_casted_actor}")
The most common genre for a movie or TV show is: Anupam Kher
In [49]:
df.columns
Out[49]:
Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'genres', 'cast_list'],
      dtype='object')
In [50]:
df['duration']
Out[50]:
0          90 min
1       2 Seasons
2        1 Season
3        1 Season
4       2 Seasons
          ...    
8802      158 min
8803    2 Seasons
8804       88 min
8805       88 min
8806      111 min
Name: duration, Length: 8807, dtype: object
In [51]:
tv_shows = df[df['type'] == 'TV Show']
movies = df[df['type'] == 'Movies']
In [52]:
tv_shows['duration'] = pd.to_numeric(df['duration'].str.extract('(\d+)', expand=False), errors='coerce')
/tmp/ipykernel_138/625843817.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tv_shows['duration'] = pd.to_numeric(df['duration'].str.extract('(\d+)', expand=False), errors='coerce')
In [53]:
longest_running_show = tv_shows.loc[tv_shows['duration'].idxmax()]
In [54]:
print(f"Longest running show: {longest_running_show['title']}")
Longest running show: Grey's Anatomy
In [56]:
movies['duration'] = pd.to_numeric(df['duration'].str.extract('(\d+)', expand=False), errors='coerce')
In [59]:
fig, ax = plt.subplots(figsize =(12,8))
ax = sns.countplot(y=df.rating,order = df.rating.value_counts().index,palette='rocket')
ax.bar_label(ax.containers[0])
ax.set_title('Number of ratings')
Out[59]:
Text(0.5, 1.0, 'Number of ratings')
In [60]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       8807 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
 12  genres        8807 non-null   object
 13  cast_list     7982 non-null   object
dtypes: int64(1), object(13)
memory usage: 963.4+ KB
In [61]:
df['date_added'] = pd.to_datetime(df['date_added'],format = 'mixed')
df['month_added'] = df['date_added'].dt.month
df['year_added'] = df['date_added'].dt.year
df.head(3)
Out[61]:
show_id type title director cast country date_added release_year rating duration listed_in description genres cast_list month_added year_added
0 s1 Movie Dick Johnson Is Dead Kirsten Johnson NaN United States 2021-09-25 2020 PG-13 90 min Documentaries As her father nears the end of his life, filmm... [Documentaries] NaN 9.0 2021.0
1 s2 TV Show Blood & Water NaN Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... South Africa 2021-09-24 2021 TV-MA 2 Seasons International TV Shows, TV Dramas, TV Mysteries After crossing paths at a party, a Cape Town t... [International TV Shows, TV Dramas, TV Mysteries] [Ama Qamata, Khosi Ngema, Gail Mabalane, Thaba... 9.0 2021.0
2 s3 TV Show Ganglands Julien Leclercq Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... United States 2021-09-24 2021 TV-MA 1 Season Crime TV Shows, International TV Shows, TV Act... To protect his family from a powerful drug lor... [Crime TV Shows, International TV Shows, TV Ac... [Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nab... 9.0 2021.0

Replacing Unexpected Values¶

In [72]:
df.rating.value_counts()
df.rating.replace(['74 min', '84 min', '66 min'], 'TV-MA',inplace=True)

Replacing with Correct rating¶

In [73]:
df.loc[[5989], ['rating']] = 'TV-PG'
df.loc[[6827], ['rating']] = 'TV-14'
df.loc[[7312], ['rating']] = 'TV-PG'
df.loc[[7537], ['rating']] = 'PG-13'

Changing Rating According to Netflix¶

In [74]:
df.loc[df.rating.isin(['TV-Y7-FV']), ['rating']] = 'TV-Y7'
df.loc[df.rating.isin(['TV-G']), ['rating']] = 'G'
df.loc[df.rating.isin(['TV-PG']), ['rating']] = 'PG'
df.loc[df.rating.isin(['TV-MA']), ['rating']] = 'R'
df.loc[df.rating.isin(['NR', 'UR']), ['rating']] = 'nrur'

Plotting Ratings After processing¶

In [75]:
fig, ax = plt.subplots(figsize =(12,8))
ax = sns.countplot(y=df.rating,order = df.rating.value_counts().index,palette='rocket')
ax.bar_label(ax.containers[0])
ax.set_title('Number of ratings')
Out[75]:
Text(0.5, 1.0, 'Number of ratings')

Creating a New column Maturity Level¶

In [76]:
#create a new column to catergory kid, teen, adult
kid = df[df.rating.isin(['TV-Y','TV-Y7','G','PG'])].index
teen = df[df.rating.isin(['PG-13','TV-14'])].index
adult = df[df.rating.isin(['R','NC-17','nrur'])].index
df.loc[kid, 'maturity_level'] = 'kid'
df.loc[teen, 'maturity_level'] = 'teen'
df.loc[adult, 'maturity_level'] = 'adult'

Ploting Maturity and Number of shows divided¶

In [78]:
fig, ax = plt.subplots(figsize =(12,8))
ax = sns.countplot(y=df.maturity_level,palette='rocket')
ax.bar_label(ax.containers[0])
ax.set_title('Number of movies')
Out[78]:
Text(0.5, 1.0, 'Number of movies')

Implementation of Algorithms¶

In [31]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from fuzzywuzzy import process

# Function to preprocess and create a content column
def preprocess_data(df):
    
    
    
    # replacing missing country values with the most common country
    df.rating.replace(['74 min', '84 min', '66 min'], 'TV-MA',inplace=True)
    
    #replacing with 
    mode_country = df['country'].mode()[0]
    df['country'].fillna(mode_country, inplace=True)
    
    # replacing correct rating 
    df.loc[[5989], ['rating']] = 'TV-PG'
    df.loc[[6827], ['rating']] = 'TV-14'
    df.loc[[7312], ['rating']] = 'TV-PG'
    df.loc[[7537], ['rating']] = 'PG-13'

    #Designing according to the Netflix algorithm
    df.loc[df.rating.isin(['TV-Y7-FV']), ['rating']] = 'TV-Y7'
    df.loc[df.rating.isin(['TV-G']), ['rating']] = 'G'
    df.loc[df.rating.isin(['TV-PG']), ['rating']] = 'PG'
    df.loc[df.rating.isin(['TV-MA']), ['rating']] = 'R'
    df.loc[df.rating.isin(['NR', 'UR']), ['rating']] = 'nrur'
    
    #Setting Maturity Levels for kids, Teens, Adults
    kid = df[df.rating.isin(['TV-Y','TV-Y7','G','PG'])].index
    teen = df[df.rating.isin(['PG-13','TV-14'])].index
    adult = df[df.rating.isin(['R','NC-17','nrur'])].index
    df.loc[kid, 'maturity_level'] = 'kid'
    df.loc[teen, 'maturity_level'] = 'teen'
    df.loc[adult, 'maturity_level'] = 'adult'
                      
    # combining all the contents making a big string of knowledge
    df['content'] = df['title'].astype(str) + ' ' + df['director'].astype(str) + ' ' + df['cast'].astype(str) + ' ' + df['country'].astype(str) + ' ' + df['rating'].astype(str) + df['duration'].astype(str) + ' ' + df['listed_in'].astype(str) + ' ' + df['description'].astype(str)  + ' ' + df['maturity_level'].astype(str)
    df['content'] = df['content'].fillna('')
    

# Function to create TF-IDF matrix
def create_tfidf_matrix(df):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['content'])
    return tfidf_matrix

# Function to create Bag of Words (BoW) matrix
def create_bow_matrix(df):
    count_vectorizer = CountVectorizer()
    bow_matrix = count_vectorizer.fit_transform(df['content'])
    return bow_matrix

# Function to compute TF-IDF cosine similarity
def tfidf_cosine_similarity(tfidf_matrix):
    cosine_sim = cosine_similarity(tfidf_matrix)
    return cosine_sim

# Function to compute BoW cosine similarity
def bow_cosine_similarity(bow_matrix):
    cosine_sim = cosine_similarity(bow_matrix)
    return cosine_sim

# Function to train Word2Vec model
def train_word2vec(df):
    df['tokenized_content'] = df['content'].apply(simple_preprocess)
    model = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)
    model.build_vocab(df['tokenized_content'])
    model.train(df['tokenized_content'], total_examples=model.corpus_count, epochs=10)
    return model

# Function to average word vectors for a text
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features) for tokenized_sentence in corpus]
    return np.array(features)

# Function to compute Word2Vec-based similarity
def word2vec_similarity(user_movie, df):
    user_movie = find_similar_movies_fuzzy(df, user_movie)
    df['tokenized_content'] = df['content'].apply(simple_preprocess)
    model = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)
    model.build_vocab(df['tokenized_content'])
    model.train(df['tokenized_content'], total_examples=model.corpus_count, epochs=10)
    movie_index = df[df['title'] == user_movie].index[0]
    w2v_feature_array = averaged_word_vectorizer(corpus=df['tokenized_content'], model=model, num_features=100)

    # Compute the cosine similarities between the user movie and all other movies
    user_movie_vector = w2v_feature_array[movie_index].reshape(1, -1)
    similarity_scores = cosine_similarity(user_movie_vector, w2v_feature_array)

    # Get the top 10 most similar movies
    similar_movies = list(enumerate(similarity_scores[0]))
    sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)[1:20]

    # Print the top 10 similar movies
    for i, score in sorted_similar_movies:
        print("{}: {}".format(i, df.loc[i, 'title']))

    


# Function to compute BoW-based similarity
def bow_similarity(user_movie, df, bow_matrix):
    user_movie = find_similar_movies_fuzzy(df, user_movie)
    movie_index = df[df['title'] == user_movie].index[0]
    similarity_scores = bow_cosine_similarity(bow_matrix)
    similar_movies = list(enumerate(similarity_scores[movie_index]))
    sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)[1:20]
    for i, score in sorted_similar_movies:
        print("{}: {}".format(i, df.loc[i, 'title']))

# Function to compute TF-IDF-based similarity
def tfidf_similarity(user_movie, df, tfidf_matrix):
    user_movie = find_similar_movies_fuzzy(df, user_movie)
    movie_index = df[df['title'] == user_movie].index[0]
    similarity_scores = tfidf_cosine_similarity(tfidf_matrix)
    similar_movies = list(enumerate(similarity_scores[movie_index]))
    sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)[1:20]
    for i, score in sorted_similar_movies:
        print("{}: {}".format(i, df.loc[i, 'title']))

# Function to find similar movies using fuzzy string matching
def find_similar_movies_fuzzy(df, movie_name):
    top_movies = process.extract(movie_name, df['title'], limit=5)
    return top_movies[0][0]

def similar_movies_fuzzy(df, movie_name):
    top_movies = process.extract(movie_name, df['title'], limit=5)
    print("Advanced Search and similar Alternatives")
    for movie, score, index in top_movies:
        print(f"Movie: {movie}, Similarity Score: {score}")
    

# Main program
if __name__ == "__main__":
    # Load your DataFrame 'df' with movie data here
    df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

    # Preprocess data and create the 'content' column
    preprocess_data(df)

    # Create the TF-IDF matrix and BoW matrix
    tfidf_matrix = create_tfidf_matrix(df)
    bow_matrix = create_bow_matrix(df)

    # Train Word2Vec model
    #word2vec_model = train_word2vec(df)

    # Get user input
    user_movie = input("Enter a movie title: ")
    
    similar_movies_fuzzy(df, user_movie)
    
    print("\nSimilar Movies (TF-IDF Cosine Similarity):")
    tfidf_similarity(user_movie, df, tfidf_matrix)

    print("\nSimilar Movies (BoW Cosine Similarity):")
    bow_similarity(user_movie, df, bow_matrix)

    print("\nSimilar Movies (Word2Vec Similarity):")
    similarity_scores = word2vec_similarity(user_movie, df)
    
Advanced Search and similar Alternatives
Movie: Superbad, Similarity Score: 100
Movie: AdĂș, Similarity Score: 90
Movie: P, Similarity Score: 90
Movie: Superstar, Similarity Score: 71
Movie: Esperando la carroza, Similarity Score: 68

Similar Movies (TF-IDF Cosine Similarity):
4938: Seth Rogen's Hilarity for Charity
178: The Interview
3305: Seth Meyers: Lobby Baby
5900: Wet Hot American Summer
2010: How to Train Your Dragon 2
346: Pineapple Express
5540: Win It All
5035: Dragons: Race to the Edge
6710: Evan Almighty
145: House Party 2
1833: ParaNorman
7515: Movie 43
4289: Dragons: Dawn of the Dragon Racers
2454: The Disaster Artist
4718: Like Father
5133: Trolls Holiday Special
6414: Can't Hardly Wait
4629: Maniac
1443: QB1: Beyond the Lights

Similar Movies (BoW Cosine Similarity):
145: House Party 2
7498: Monster High: Haunted
7121: Jay and Silent Bob Strike Back
5833: Brahman Naman
8401: The Longest Yard
1830: What Did I Mess
3314: 100 Things to do Before High School
648: Too Hot to Handle
8630: Trip to Bhangarh: Asia's Most Haunted Place
3912: Generation Iron 3
6563: Dare to Be Wild
4050: Weapon of Choice
8804: Zombieland
3629: Otherhood
7585: Nightcrawler
8714: Welcome to Monster High: The Origin Story
7715: Patron Mutlu Son Istiyor
8624: Tremors 4: The Legend Begins
7046: I Fine... Thank You... Love You

Similar Movies (Word2Vec Similarity):
6533: Cool Hand Luke
8608: Total Frat Movie
622: Lying and Stealing
956: Zack and Miri Make a Porno
1510: The Con Is On
5329: Chocolate City: Vegas Strip
6321: Black & Privileged: Volume 1
7579: New York Minute
7369: Mad Money
48: Training Day
6637: Doubt
142: Freedom Writers
630: Killing Them Softly
1367: Hell Fest
1034: Synchronic
6896: Green Room
5419: You Get Me
8046: Sniper: Special Ops
5830: Rebirth